# Read the datasetvac_df <-read_csv("Datasets/us_state_vaccinations.csv")# Select relevant columnscols_show <-c('date', 'location', 'daily_vaccinations_per_million', 'people_vaccinated_per_hundred', 'people_fully_vaccinated_per_hundred')t <- vac_df[, cols_show]# Group by 'date' and summarize columns, ignoring NA valuest1 <- t %>%group_by(date) %>%summarize(daily_vaccinations_per_million =sum(daily_vaccinations_per_million, na.rm =TRUE),people_vaccinated_per_hundred =mean(people_vaccinated_per_hundred, na.rm =TRUE),people_fully_vaccinated_per_hundred =mean(people_fully_vaccinated_per_hundred, na.rm =TRUE) )# Convert date column to Date formatt1$date <-as.Date(t1$date)# Aggregate data to monthly level using mean for each columnt1 <- t1 %>%group_by(date =format(date, "%Y-%m")) %>%summarize(daily_vaccinations_per_million =mean(daily_vaccinations_per_million, na.rm =TRUE),people_vaccinated_per_hundred =mean(people_vaccinated_per_hundred, na.rm =TRUE),people_fully_vaccinated_per_hundred =mean(people_fully_vaccinated_per_hundred, na.rm =TRUE))# Transform the date column type with specified formatt1$date <-as.Date(paste0(t1$date, "-01-01"))# Visualize the plotd_vacc_ts <-ts(t1$daily_vaccinations_per_million, start =c(year(min(t1$date)), month(min(t1$date))), end =c(year(max(t1$date)), month(max(t1$date))), frequency =12)pe<-autoplot(d_vacc_ts, xlab ="Time", ylab ="Daily Vaccinations per Million", colour ="#5a3196")+ggtitle('Time Series Plot of Daily Vaccinations per Million in the US')+theme_bw()ggplotly(pe)
Code
# Visualize the plotp_vacc_ts <-ts(t1$people_vaccinated_per_hundred, start =c(year(min(t1$date)), month(min(t1$date))), end =c(year(max(t1$date)), month(max(t1$date))), frequency =12)pe<-autoplot(p_vacc_ts, xlab ="Time", ylab ="People Vaccinated per Hundred", colour ="#5a3196")+ggtitle('Time Series Plot of People Vaccinated per Hundred in the US')+theme_bw()ggplotly(pe)
Code
# Visualize the plotpf_vacc_ts <-ts(t1$people_fully_vaccinated_per_hundred, start =c(year(min(t1$date)), month(min(t1$date))), end =c(year(max(t1$date)), month(max(t1$date))), frequency =12)pe<-autoplot(pf_vacc_ts, xlab ="Time", ylab ="People Fully Vaccinated per Hundred", colour ="#5a3196")+ggtitle('Time Series Plot of People Fully Vaccinated per Hundred in the US')+theme_bw()ggplotly(pe)
Code
# Newly confirmed caseswide_data <-read_csv("Datasets/covid_confirmed_usafacts.csv")# Define the key and value columns for pivotingkey_cols <-c("countyFIPS", "County Name", "State", "StateFIPS")value_cols <-setdiff(names(wide_data), key_cols)# Pivot the data from wide to longlong_data <-pivot_longer( wide_data,cols = value_cols,names_to ="date",values_to ="value")# Group by 'State' and 'date', and calculate the sum of Confirmed Casescon_case_df <- long_data %>%group_by(date) %>%summarize(value_sum =sum(value, na.rm =TRUE))# Convert date column to Date formatcon_case_df$date <-as.Date(con_case_df$date)# Aggregate data to monthly level using mean for each columncon_case_df <- con_case_df %>%group_by(date =format(date, "%Y-%m")) %>%summarize(value_sum =mean(value_sum, na.rm =TRUE))# Transform the date column type with specified formatcon_case_df$date <-as.Date(paste0(con_case_df$date, "-01-01"))# Visualize the plotcase_ts <-ts(con_case_df$value_sum, start =c(year(min(con_case_df$date)), month(min(con_case_df$date))), end =c(year(max(con_case_df$date)), month(max(con_case_df$date))), frequency =12)pe<-autoplot(case_ts, xlab ="Time", ylab ="Newly Confirmed Cases", colour ="#5a3196")+ggtitle('Time Series Plot of Newly Confirmed COVID Cases in the US')+theme_bw()ggplotly(pe)
Code
# Death caseswide_data <-read_csv("Datasets/covid_deaths_usafacts.csv")# Define the key and value columns for pivotingkey_cols <-c("countyFIPS", "County Name", "State", "StateFIPS")value_cols <-setdiff(names(wide_data), key_cols)# Pivot the data from wide to longlong_data <-pivot_longer( wide_data,cols = value_cols,names_to ="date",values_to ="value")# Group by 'State' and 'date', and calculate the sum of Confirmed Casesdead_case_df <- long_data %>%group_by(date) %>%summarize(value_sum =sum(value, na.rm =TRUE))# Convert date column to Date formatdead_case_df$date <-as.Date(dead_case_df$date)# Aggregate data to monthly level using mean for each columndead_case_df <- dead_case_df %>%group_by(date =format(date, "%Y-%m")) %>%summarize(value_sum =mean(value_sum, na.rm =TRUE))# Transform the date column type with specified formatdead_case_df$date <-as.Date(paste0(dead_case_df$date, "-01-01"))# Visualize the plotdead_ts <-ts(dead_case_df$value_sum, start =c(year(min(dead_case_df$date)), month(min(dead_case_df$date))), end =c(year(max(dead_case_df$date)), month(max(dead_case_df$date))), frequency =12)pe<-autoplot(dead_ts, xlab ="Time", ylab ="Dead Cases", colour ="#5a3196")+ggtitle('Time Series Plot of Dead COVID Cases in the US')+theme_bw()ggplotly(pe)
Code
hos_df <-read_csv('Datasets/COVID-19_hos.csv')# data glimpsecols_show <-c('state', 'date', 'inpatient_beds', 'inpatient_beds_used_covid', 'inpatient_bed_covid_utilization')t <- hos_df[, cols_show]# Group by 'date', and calculate the sum of Confirmed Caseshos <- t %>%group_by(date) %>%summarize(value_sum1 =sum(inpatient_beds, na.rm =TRUE),value_sum2 =sum(inpatient_beds_used_covid, na.rm =TRUE),value_sum3 =mean(inpatient_bed_covid_utilization, na.rm =TRUE))# Convert date column to Date formathos$date <-as.Date(hos$date)# Aggregate data to monthly level using mean for each columnhos <- hos %>%group_by(date =format(date, "%Y-%m")) %>%summarize(value_sum1 =mean(value_sum1, na.rm =TRUE),value_sum2 =mean(value_sum2, na.rm =TRUE),value_sum3 =mean(value_sum3, na.rm =TRUE))# Transform the date column type with specified formathos$date <-as.Date(paste0(hos$date, "-01-01"))# Visualize the plothos_ts1 <-ts(hos$value_sum1, start =c(year(min(hos$date)), month(min(hos$date))), end =c(year(max(hos$date)), month(max(hos$date))), frequency =12)pe<-autoplot(hos_ts1, xlab ="Time", ylab ="Number of Inpatient Beds", colour ="#5a3196")+ggtitle('Time Series Plot of Number of Inpatient Beds in the US')+theme_bw()ggplotly(pe)
Code
# Visualize the plothos_ts2 <-ts(hos$value_sum2, start =c(year(min(hos$date)), month(min(hos$date))), end =c(year(max(hos$date)), month(max(hos$date))), frequency =12)pe<-autoplot(hos_ts2, xlab ="Time", ylab ="Number of Inpatient Beds Used for COVID", colour ="#5a3196")+ggtitle('Time Series Plot of Number of Inpatient Beds Used for COVID in the US')+theme_bw()ggplotly(pe)
Code
# Visualize the plothos_ts3 <-ts(hos$value_sum3, start =c(year(min(hos$date)), month(min(hos$date))), end =c(year(max(hos$date)), month(max(hos$date))), frequency =12)pe<-autoplot(hos_ts3, xlab ="Time", ylab ="Utilization Rate of Inpatient Beds for COVID", colour ="#5a3196")+ggtitle('Time Series Plot of Utilization Rate of Inpatient Beds for COVID in the US')+theme_bw()ggplotly(pe)
Code
gdp <-read_csv('Datasets/gdp.csv')# Convert DATE column from m/d/yy format to Date object and reformat to "Year" only for simplicitygdp$DATE <-format(mdy(gdp$DATE), "%Y/%m/%d")# Convert GDP column to numeric (floating-point) format if not alreadygdp$GDP <-as.numeric(gdp$GDP)# Visualize the plotstart_year <-year(min(gdp$DATE))start_month <-month(min(gdp$DATE))end_year <-year(max(gdp$DATE))end_month <-month(max(gdp$DATE))# Calculate the number of observations from start to endnum_obs <- (end_year - start_year) *4+ceiling((end_month - start_month) /3)# Create the time series objectgdp_ts <-ts(gdp$GDP, start =c(start_year, start_month), frequency =4, deltat =0.25)pe<-autoplot(gdp_ts, xlab ="Time", ylab ="GDP Per Capita", colour ="#5a3196")+ggtitle('Time Series Plot of GDP Per Capita by Year in US')+theme_bw()ggplotly(pe)
Code
unemp <-read_csv('Datasets/unemployment.csv')key_cols <-c("Location")value_cols <-setdiff(names(unemp), key_cols)unemp1 <-pivot_longer( unemp,cols = value_cols,names_to ="Time",values_to ="Unemployment")# Convert Time column to Date formatunemp1$Time <-as.Date(paste0(unemp1$Time, "-01"))# Convert Unemployment column to numeric (floating-point) formatunemp1$Unemployment <-as.numeric(unemp1$Unemployment)# Focus on USunemp2 <- unemp1[unemp1$Location =='United States',]# Visualize the plotemployees_ts <-ts(unemp2$Unemployment, start =c(year(min(unemp2$Time)), month(min(unemp2$Time))), end =c(year(max(unemp2$Time)), month(max(unemp2$Time))), frequency =12)pe<-autoplot(employees_ts, xlab ="Time", ylab ="Unemployment Rate", colour ="#5a3196")+ggtitle('Time Series Plot of Unemployment Rate by Month in US')+theme_bw()ggplotly(pe)
Code
# Set options to suppress warningsoptions("getSymbols.warning4.0"=FALSE)options("getSymbols.yahoo.warning"=FALSE)# Define the tickerstickers <-c("PFE")# Loop through tickers to get stock datafor (ticker in tickers) {getSymbols(ticker,from ="2020-01-01",to ="2024-01-01")}# Create a data frame with adjusted closing pricesstock <-data.frame(date =index(PFE), value =Ad(PFE))# Visualize the plotstock_ts <-ts(stock$PFE.Adjusted, start =c(year(min(stock$date)), month(min(stock$date))), end =c(year(max(stock$date)), month(max(stock$date))), frequency =12)pe<-autoplot(stock_ts, xlab ="Time", ylab ="Adjusted Price", colour ="#5a3196")+ggtitle('Time Series Plot of Pfizer Stock Price (2020-2024)')+theme_bw()ggplotly(pe)
Code
demo <-read_excel('Datasets/party.xlsx',sheet ='Democrat')inde <-read_excel('Datasets/party.xlsx',sheet ='Independent')rep <-read_excel('Datasets/party.xlsx',sheet ='Republican')# Transform the wide dataframe into a long dataframekey_cols <-c("Attitude")value_cols <-setdiff(names(demo), key_cols)demo1 <-pivot_longer( demo,cols = value_cols,names_to ="Time",values_to ="democrat")inde1 <-pivot_longer( inde,cols = value_cols,names_to ="Time",values_to ="independent")rep1 <-pivot_longer( rep,cols = value_cols,names_to ="Time",values_to ="republican")# Combine these three datasets togethercombined_data <-full_join(demo1, inde1, by =c("Time", "Attitude")) %>%full_join(rep1, by =c("Time", "Attitude"))combined_data1 <- combined_data[combined_data$Attitude=='Favorable',]# Define the key and value columns for pivotingkey_cols <-c("Attitude", "Time")value_cols <-setdiff(names(combined_data1), key_cols)# Pivot the data from wide to longcombined_data2 <-pivot_longer( combined_data1,cols = value_cols,names_to ="Party",values_to ="value")# Convert date column to Date formatcombined_data2$Time <-as.Date(combined_data2$Time)# Subset to each partydemo_data <- combined_data2[combined_data2$Party=='democrat',]inde_data <- combined_data2[combined_data2$Party=='independent',]rep_data <- combined_data2[combined_data2$Party=='republican',]# Visualize the plotgg <-ggplot(combined_data2, aes(x = Time, y = value, color = Party)) +geom_line() +labs(title ="Support Rate for Different Paties Over Time", x ="Time", y ="Percentage") +theme_minimal()plotly_gg <-ggplotly(gg)plotly_gg
Code
# Visualize the plotdemo_ts <-ts(demo_data$value, start =c(year(min(demo_data$Time)), month(min(demo_data$Time))), end =c(year(max(demo_data$Time)), month(max(demo_data$Time))), frequency =12)pe<-autoplot(demo_ts, xlab ="Time", ylab ="Percentage", colour ="#5a3196")+ggtitle('Time Series Plot of Support Rate for Democratic')+theme_bw()ggplotly(pe)
Code
# Visualize the plotinde_ts <-ts(inde_data$value, start =c(year(min(inde_data$Time)), month(min(inde_data$Time))), end =c(year(max(inde_data$Time)), month(max(inde_data$Time))), frequency =12)pe<-autoplot(inde_ts, xlab ="Time", ylab ="Percentage", colour ="#5a3196")+ggtitle('Time Series Plot of Support Rate for Independent')+theme_bw()ggplotly(pe)
Code
# Visualize the plotrep_ts <-ts(rep_data$value, start =c(year(min(rep_data$Time)), month(min(rep_data$Time))), end =c(year(max(rep_data$Time)), month(max(rep_data$Time))), frequency =12)pe<-autoplot(rep_ts, xlab ="Time", ylab ="Percentage", colour ="#5a3196")+ggtitle('Time Series Plot of Support Rate for Republican')+theme_bw()ggplotly(pe)
If the time series is additive, it follows this equation: \(Series = Trend + Seasonal + Random\)
If the time series is multiplicative, it follows this equation: \(Series = Trend * Seasonal * Random\)
Number of Daily Vaccinations Per Million: This metrics reflects the dynamic nature of our battle against the virus, with fluctuations influenced primarily by the evolving strains and their virulence. Noteworthy peaks occurred in April and October of 2021, possibly corresponding to strategic shifts in vaccination drives or response measures. In terms of seasonality, there is no clear sign of seasonality being present, therefore, we can conclude that there isn’t seasonality in the series based on unperceptive pattern. Moreover, the series is multiplicative.
Number of People Vaccinated Per Hundred: This metrics reveals a consistent upward trend over time. Notably, from January 2021 to July 2021, the rate of increase surged significantly compared to the period after July 2021, suggesting a pronounced acceleration in vaccination efforts during that timeframe and indicating a potential decline in the virus’s threat level. Seasonality analysis yields no discernible patterns, as vaccination rates are driven by individual choices rather than seasonal influences. Thus, the absence of any seasonal trends is evident. Moreover, the series is multiplicative.
Number of People Fully Vaccinated Per Hundred: This metrics is similar to the trend of the number of people vaccinated per hundred, which also reveals a consistent upward trend over time. Notably, from January 2021 to July 2021, the rate of increase surged significantly compared to the period after July 2021, suggesting a pronounced acceleration in vaccination efforts during that timeframe and indicating a potential decline in the virus’s threat level. Seasonality analysis yields no discernible patterns, as vaccination rates are driven by individual choices rather than seasonal influences. Thus, the absence of any seasonal trends is evident. Moreover, the series is multiplicative.
Number of Newly Confirmed Cases: This metric demonstrates a consistent upward trend over time, albeit with varying rates of increase observed across different periods. Notably, from October 2021 to February 2022, there was a notable acceleration in the rise of newly confirmed COVID-19 cases. In contrast, during other time periods, the pace of increase remained relatively steady. Despite thorough analysis, no discernible seasonal patterns emerge from this time series, suggesting that the incidence of confirmed cases is not influenced by seasonal factors. Moreover, the series is multiplicative.
Number of Death Cases: The trajectory of death cases is similar to that of newly confirmed cases, depicting an overall upward trend. Noteworthy spikes in the rate of fatalities occurred during specific intervals, especially during October 2020 to April 2021 and October 2021 to February 2022, suggesting heightened mortality rates during these periods. Despite analyzing seasonality, no discernible patterns emerge, indicating that mortality trends are not influenced by seasonal variations. Moreover, the series is multiplicative.
Number of Inpatient Beds: Initially, this series exhibited a sharp increase during the early stages of the COVID-19 pandemic. And starting from October 2020, the number of inpatient beds stabilized and reached a much higher number. The number occasionally fluctuated within a certain range after October 2020. Analysis of seasonality reveals no discernible patterns, indicating the absence of any seasonal influences on the series. Moreover, the series is multiplicative.
Number of Inpatient Beds Used for COVID: This metric exhibits significant fluctuations over time, with notable peaks observed in periods such as January 2021, September 2021, and January 2022, and nowadays the number of inpatient beds used for COVID became much lower. These peaks are not attributable to seasonal variations but rather correlate with surges in COVID-19 cases, often driven by the emergence of new virus variants. Consequently, the absence of any discernible seasonal patterns is evident in this time series. Moreover, the series is multiplicative.
Utilization Rate for Inpatient Beds Used for COVID: This metric mirrors the fluctuations observed in the number of inpatient beds occupied by COVID patients, with significant peaks evident during key periods such as January 2021, September 2021, and January 2022. Notably, recent data reflects a marked decline in COVID-related bed occupancy, stabilizing at approximately 5%. These fluctuations are not merely a result of seasonal variations but rather closely align with spikes in COVID-19 cases, often coinciding with the emergence of new virus variants. As such, the absence of discernible seasonal patterns underscores the dynamic nature of this metric, dictated by the evolving landscape of the pandemic. Moreover, the series is multiplicative.
GDP Per Capita: The trajectory of GDP per capita demonstrates a consistent upward trend, indicating economic growth and prosperity over time. Notably, this trend is not subject to seasonal fluctuations, as it is inherently tied to annual economic performance rather than periodic factors. Moreover, the series is multiplicative.
Unemployment Rate: The unemployment time series witnessed a sharp escalation in the early stages of the COVID-19 pandemic, particularly evident in March, as businesses grappled with widespread closures and economic uncertainty. However, in response to this crisis, governments worldwide swiftly implemented policies aimed at stimulating job creation and fostering economic recovery. Consequently, the unemployment rate began a gradual descent post-March 2020, signaling a positive trajectory towards stabilizing the job market. Presently, the unemployment rate hovers around 4%, indicative of a more stabilized employment landscape. Notably, similar to other pandemic-related metrics, the unemployment rate displays minimal seasonality, as its fluctuations are primarily driven by the virus’s impact on economic activity rather than seasonal patterns. Moreover, the series is multiplicative.
Pfizer Stock Price: The Pfizer stock price has exhibited significant fluctuations, with notable peaks occurring in October 2020 and February 2021. These spikes can largely be attributed to the successful development and widespread adoption of booster vaccinations during critical phases of the COVID-19 pandemic. The surge in stock value during these periods reflects investor confidence in Pfizer’s pivotal role in combating the virus. However, following March 2023, a discernible downward trajectory in stock price emerged, signaling a broader trend of decline within the medical sector during the post-pandemic era. Despite these fluctuations, the stock price demonstrates no clear seasonality, as its movements are intricately linked to the ebb and flow of the COVID-19 virus itself. Moreover, the series is multiplicative.
Party Support Rate: The party support rate exhibits continuous fluctuations, reflecting the dynamic nature of political sentiment. However, a prevailing trend emerges: the support rate for the Democratic party consistently surpasses that of independent and Republican factions. This enduring pattern underscores the enduring appeal and resonance of Democratic ideals among the populace. The absence of discernible seasonality in these fluctuations underscores that the shifts are driven primarily by the ever-evolving opinions and perceptions of the public rather than external factors tied to specific seasons or events. Party support rates in the US can vary significantly over time due to various factors such as political events, policy decisions, economic conditions, and societal changes. Moreover, the series is multiplicative.
gglagplot(d_vacc_ts, do.lines=FALSE) +xlab("Lags")+ylab("Daily Vaccinations per Million")+ggtitle("Lag Plot for Daily Vaccinations per Million")+theme(axis.text.x=element_text(angle=45, hjust=1)) +theme_bw()
Code
gglagplot(p_vacc_ts, do.lines=FALSE) +xlab("Lags")+ylab("People Vaccinated Per Hundred")+ggtitle("Lag Plot for People Vaccinated Per Hundred")+theme(axis.text.x=element_text(angle=45, hjust=1)) +theme_bw()
Code
gglagplot(pf_vacc_ts, do.lines=FALSE) +xlab("Lags")+ylab("People Fully Vaccinated Per Hundred")+ggtitle("Lag Plot for People Fully Vaccinated Per Hundred")+theme(axis.text.x=element_text(angle=45, hjust=1)) +theme_bw()
gglagplot(dead_ts, do.lines=FALSE) +xlab("Lags")+ylab("Dead Cases")+ggtitle("Lag Plot for Dead Cases")+theme(axis.text.x=element_text(angle=45, hjust=1)) +theme_bw()
Code
gglagplot(hos_ts1, do.lines=FALSE) +xlab("Lags")+ylab("Number of Inpatient Beds")+ggtitle("Lag Plot for Number of Inpatient Beds")+theme(axis.text.x=element_text(angle=45, hjust=1)) +theme_bw()
Code
gglagplot(hos_ts2, do.lines=FALSE) +xlab("Lags")+ylab("Number of Inpatient Beds Used for COVID")+ggtitle("Lag Plot for Number of Inpatient Beds Used for COVID")+theme(axis.text.x=element_text(angle=45, hjust=1)) +theme_bw()
Code
gglagplot(hos_ts3, do.lines=FALSE) +xlab("Lags")+ylab("Utilization Rate of Inpatient Beds for COVID")+ggtitle("Lag Plot for Utilization Rate of Inpatient Beds for COVID")+theme(axis.text.x=element_text(angle=45, hjust=1)) +theme_bw()
Code
gglagplot(employees_ts, do.lines=FALSE) +xlab("Lags")+ylab("Unemployment Rate")+ggtitle("Lag Plot for Unemployment Rate")+theme(axis.text.x=element_text(angle=45, hjust=1)) +theme_bw()
Code
gglagplot(stock_ts, do.lines=FALSE) +xlab("Lags")+ylab("Adjusted Price")+ggtitle("Lag Plot for Adjusted Price of Pfizer")+theme(axis.text.x=element_text(angle=45, hjust=1)) +theme_bw()
Code
gglagplot(demo_ts, do.lines=FALSE) +xlab("Lags")+ylab("Percentage")+ggtitle("Lag Plot for Support Rate for Democratic")+theme(axis.text.x=element_text(angle=45, hjust=1)) +theme_bw()
Code
gglagplot(inde_ts, do.lines=FALSE) +xlab("Lags")+ylab("Percentage")+ggtitle("Lag Plot for Support Rate for Independent")+theme(axis.text.x=element_text(angle=45, hjust=1)) +theme_bw()
Code
gglagplot(rep_ts, do.lines=FALSE) +xlab("Lags")+ylab("Percentage")+ggtitle("Lag Plot for Support Rate for Republican")+theme(axis.text.x=element_text(angle=45, hjust=1)) +theme_bw()
Number of Daily Vaccinations Per Million: We observe a pronounced positive autocorrelation in the initial lag (lag 1), gradually diminishing in strength as we progress through subsequent lags, until reaching a point where no discernible autocorrelation exists, or where any remaining autocorrelation is exceedingly weak, particularly evident in lags 10 through 16.
Number of People Vaccinated Per Hundred: We observe a pronounced positive autocorrelation in the first 3 lags, gradually diminishing in strength as we progress through subsequent lags, until reaching a point where no discernible autocorrelation exists, or where any remaining autocorrelation is exceedingly weak, particularly evident in lags 7 through 16.
Number of People Fully Vaccinated Per Hundred: We observe a pronounced positive autocorrelation in the first 3 lags, gradually diminishing in strength as we progress through subsequent lags, until reaching a point where no discernible autocorrelation exists, or where any remaining autocorrelation is exceedingly weak, particularly evident in lags 7 through 16.
Number of Newly Confirmed Cases: We observe a pronounced positive autocorrelation in the first 2 lags, gradually diminishing in strength as we progress through subsequent lags, until reaching a point where no discernible autocorrelation exists, or where any remaining autocorrelation is exceedingly weak, particularly evident in lags 13 through 16.
Number of Death Cases: We observe a pronounced positive autocorrelation in the first 3 lags, gradually diminishing in strength as we progress through subsequent lags, until reaching a point where no discernible autocorrelation exists, or where any remaining autocorrelation is exceedingly weak, particularly evident in lags 12 through 16.
Number of Inpatient Beds: We observe a positive autocorrelation within the first two lags, followed by a cessation of autocorrelation or the presence of exceedingly faint positive autocorrelation in subsequent lags.
Number of Inpatient Beds Used for COVID: We observe a positive autocorrelation within the initial lag (lag 1), followed by a cessation of autocorrelation or the presence of exceedingly faint positive autocorrelation in subsequent lags.
Utilization Rate for Inpatient Beds Used for COVID: We observe a positive autocorrelation within the initial lag (lag 1), followed by a cessation of autocorrelation or the presence of exceedingly faint positive autocorrelation in subsequent lags.
Unemployment Rate: We observe a positive autocorrelation within the first two lags, followed by a cessation of autocorrelation or the presence of exceedingly faint positive autocorrelation in subsequent lags.
Pfizer Stock Price: We observe a pronounced positive autocorrelation in the first 3 lags, gradually diminishing in strength as we progress through subsequent lags, until reaching a point where no discernible autocorrelation exists, or where any remaining autocorrelation is exceedingly weak, particularly evident in lags 9 through 16.
Support Rate for Democratic: We observe there is no significant autocorrelation across all lags. This absence suggests that there is no systematic relationship between the support rate for the Democratic party at one point in time and its support rate at subsequent points. In other words, fluctuations in support for the Democratic party appear to occur independently over time, without any discernible pattern of correlation or dependency between successive observations. This finding implies a lack of persistent trends or cyclical patterns in the support rate for the Democratic party, reflecting the stochastic nature of political opinion dynamics.
Support Rate for Independent: We observe there is no significant autocorrelation across all lags. This absence suggests that there is no systematic relationship between the support rate for the Independent party at one point in time and its support rate at subsequent points. In other words, fluctuations in support for the Independent party appear to occur independently over time, without any discernible pattern of correlation or dependency between successive observations. This finding implies a lack of persistent trends or cyclical patterns in the support rate for the Independent party, reflecting the stochastic nature of political opinion dynamics.
Support Rate for Republican: We observe there is no significant autocorrelation across all lags. This absence suggests that there is no systematic relationship between the support rate for the Republican party at one point in time and its support rate at subsequent points. In other words, fluctuations in support for the Republican party appear to occur independently over time, without any discernible pattern of correlation or dependency between successive observations. This finding implies a lack of persistent trends or cyclical patterns in the support rate for the Republican party, reflecting the stochastic nature of political opinion dynamics.
decomposed <-decompose(d_vacc_ts, "multiplicative")autoplot(decomposed, colour ="#5a3196", main ="Decomposition Plot For Daily Vaccinations per Million")+theme_bw()
Code
decomposed <-decompose(p_vacc_ts, "multiplicative")autoplot(decomposed, colour ="#5a3196", main ="Decomposition Plot For People Vaccinated per Hundred")+theme_bw()
Code
decomposed <-decompose(pf_vacc_ts, "multiplicative")autoplot(decomposed, colour ="#5a3196", main ="Decomposition Plot For People Fully Vaccinated per Hundred")+theme_bw()
Code
decomposed <-decompose(case_ts, "multiplicative")autoplot(decomposed, colour ="#5a3196", main ="Decomposition Plot For Newly Confirmed Cases")+theme_bw()
Code
decomposed <-decompose(dead_ts, "multiplicative")autoplot(decomposed, colour ="#5a3196", main ="Decomposition Plot For Dead Cases")+theme_bw()
Code
decomposed <-decompose(hos_ts1, "multiplicative")autoplot(decomposed, colour ="#5a3196", main ="Decomposition Plot For Number of Inpatient Beds")+theme_bw()
Code
decomposed <-decompose(hos_ts2, "multiplicative")autoplot(decomposed, colour ="#5a3196", main ="Decomposition Plot For Number of Inpatient Beds Used for COVID")+theme_bw()
Code
decomposed <-decompose(hos_ts3, "multiplicative")autoplot(decomposed, colour ="#5a3196", main ="Decomposition Plot For Utilization Rate of Inpatient Beds for COVID")+theme_bw()
Code
decomposed <-decompose(employees_ts, "multiplicative")autoplot(decomposed, colour ="#5a3196", main ="Decomposition Plot For Unemployment Rate")+theme_bw()
Code
decomposed <-decompose(stock_ts, "multiplicative")autoplot(decomposed, colour ="#5a3196", main ="Decomposition Plot For Pfizer Stock Price")+theme_bw()
Code
decomposed <-decompose(demo_ts, "multiplicative")autoplot(decomposed, colour ="#5a3196", main ="Decomposition Plot For Support Rate for Democratic")+theme_bw()
Code
decomposed <-decompose(inde_ts, "multiplicative")autoplot(decomposed, colour ="#5a3196", main ="Decomposition Plot For Support Rate for Independent")+theme_bw()
Code
decomposed <-decompose(rep_ts, "multiplicative")autoplot(decomposed, colour ="#5a3196", main ="Decomposition Plot For Support Rate for Republican")+theme_bw()
The decomposition plots generated for each data series further substantiate the observations highlighted in Part 1 - Time Series Plots. Through meticulous analysis, these plots unravel the underlying patterns within our datasets, lending robust support to our initial claims. By dissecting the components of the time series—trend, seasonality, and residual—these visual representations not only corroborate our earlier findings but also enhance our understanding of the dynamics at play. This congruence between the decomposition plots and the initial time series observations serves as a compelling validation of our analytical framework, reinforcing the insights derived from our preliminary exploration.
d_vacc_acf <-ggAcf(d_vacc_ts)+ggtitle("ACF Plot for Daily Vaccinations per Million") +theme_bw() +geom_segment(lineend ="butt", color ="#5a3196") +geom_hline(yintercept =0, color ="#5a3196") d_vacc_pacf <-ggPacf(d_vacc_ts)+ggtitle("PACF Plot for Daily Vaccinations per Million") +theme_bw()+geom_segment(lineend ="butt", color ="#5a3196") +geom_hline(yintercept =0, color ="#5a3196") grid.arrange(d_vacc_acf, d_vacc_pacf, nrow=2)
Code
p_vacc_acf <-ggAcf(p_vacc_ts)+ggtitle("ACF Plot for People Vaccinated per Hundred") +theme_bw() +geom_segment(lineend ="butt", color ="#5a3196") +geom_hline(yintercept =0, color ="#5a3196") p_vacc_pacf <-ggPacf(p_vacc_ts)+ggtitle("PACF Plot for People Vaccinated per Hundred") +theme_bw()+geom_segment(lineend ="butt", color ="#5a3196") +geom_hline(yintercept =0, color ="#5a3196") grid.arrange(p_vacc_acf, p_vacc_pacf, nrow=2)
Code
pf_vacc_acf <-ggAcf(pf_vacc_ts)+ggtitle("ACF Plot for People Fully Vaccinated per Hundred") +theme_bw() +geom_segment(lineend ="butt", color ="#5a3196") +geom_hline(yintercept =0, color ="#5a3196") pf_vacc_pacf <-ggPacf(pf_vacc_ts)+ggtitle("PACF Plot for People Fully Vaccinated per Hundred") +theme_bw()+geom_segment(lineend ="butt", color ="#5a3196") +geom_hline(yintercept =0, color ="#5a3196") grid.arrange(pf_vacc_acf, pf_vacc_pacf, nrow=2)
Code
case_acf <-ggAcf(case_ts)+ggtitle("ACF Plot for Newly Confirmed Cases") +theme_bw() +geom_segment(lineend ="butt", color ="#5a3196") +geom_hline(yintercept =0, color ="#5a3196") case_pacf <-ggPacf(case_ts)+ggtitle("PACF Plot for Newly Confirmed Cases") +theme_bw()+geom_segment(lineend ="butt", color ="#5a3196") +geom_hline(yintercept =0, color ="#5a3196") grid.arrange(case_acf, case_pacf, nrow=2)
Code
dead_acf <-ggAcf(dead_ts)+ggtitle("ACF Plot for Dead Cases") +theme_bw() +geom_segment(lineend ="butt", color ="#5a3196") +geom_hline(yintercept =0, color ="#5a3196") dead_pacf <-ggPacf(dead_ts)+ggtitle("PACF Plot for Dead Cases") +theme_bw()+geom_segment(lineend ="butt", color ="#5a3196") +geom_hline(yintercept =0, color ="#5a3196") grid.arrange(dead_acf, dead_pacf, nrow=2)
Code
hos1_acf <-ggAcf(hos_ts1)+ggtitle("ACF Plot for Number of Inpatient Beds") +theme_bw() +geom_segment(lineend ="butt", color ="#5a3196") +geom_hline(yintercept =0, color ="#5a3196") hos1_pacf <-ggPacf(hos_ts1)+ggtitle("PACF Plot for Number of Inpatient Beds") +theme_bw()+geom_segment(lineend ="butt", color ="#5a3196") +geom_hline(yintercept =0, color ="#5a3196") grid.arrange(hos1_acf, hos1_pacf, nrow=2)
Code
hos2_acf <-ggAcf(hos_ts2)+ggtitle("ACF Plot for Number of Inpatient Beds Used for COVID") +theme_bw() +geom_segment(lineend ="butt", color ="#5a3196") +geom_hline(yintercept =0, color ="#5a3196") hos2_pacf <-ggPacf(hos_ts2)+ggtitle("PACF Plot for Number of Inpatient Beds Used for COVID") +theme_bw()+geom_segment(lineend ="butt", color ="#5a3196") +geom_hline(yintercept =0, color ="#5a3196") grid.arrange(hos2_acf, hos2_pacf, nrow=2)
Code
hos3_acf <-ggAcf(hos_ts3)+ggtitle("ACF Plot for Utilization Rate of Inpatient Beds for COVID") +theme_bw() +geom_segment(lineend ="butt", color ="#5a3196") +geom_hline(yintercept =0, color ="#5a3196") hos3_pacf <-ggPacf(hos_ts3)+ggtitle("PACF Plot for Utilization Rate of Inpatient Beds for COVID") +theme_bw()+geom_segment(lineend ="butt", color ="#5a3196") +geom_hline(yintercept =0, color ="#5a3196") grid.arrange(hos3_acf, hos3_pacf, nrow=2)
Code
emp_acf <-ggAcf(employees_ts)+ggtitle("ACF Plot for Unemployment Rate") +theme_bw() +geom_segment(lineend ="butt", color ="#5a3196") +geom_hline(yintercept =0, color ="#5a3196") emp_pacf <-ggPacf(employees_ts)+ggtitle("PACF Plot for Unemployment Rate") +theme_bw()+geom_segment(lineend ="butt", color ="#5a3196") +geom_hline(yintercept =0, color ="#5a3196") grid.arrange(emp_acf, emp_pacf, nrow=2)
Code
stock_acf <-ggAcf(stock_ts)+ggtitle("ACF Plot for Pfizer Stock Price") +theme_bw() +geom_segment(lineend ="butt", color ="#5a3196") +geom_hline(yintercept =0, color ="#5a3196") stock_pacf <-ggPacf(stock_ts)+ggtitle("PACF Plot for Pfizer Stock Price") +theme_bw()+geom_segment(lineend ="butt", color ="#5a3196") +geom_hline(yintercept =0, color ="#5a3196") grid.arrange(stock_acf, stock_pacf, nrow=2)
Code
demo_acf <-ggAcf(demo_ts)+ggtitle("ACF Plot for Support Rate for Democratic") +theme_bw() +geom_segment(lineend ="butt", color ="#5a3196") +geom_hline(yintercept =0, color ="#5a3196") demo_pacf <-ggPacf(demo_ts)+ggtitle("PACF Plot for Support Rate for Democratic") +theme_bw()+geom_segment(lineend ="butt", color ="#5a3196") +geom_hline(yintercept =0, color ="#5a3196") grid.arrange(demo_acf, demo_pacf, nrow=2)
Code
inde_acf <-ggAcf(inde_ts)+ggtitle("ACF Plot for Support Rate for Independent") +theme_bw() +geom_segment(lineend ="butt", color ="#5a3196") +geom_hline(yintercept =0, color ="#5a3196") inde_pacf <-ggPacf(inde_ts)+ggtitle("PACF Plot for Support Rate for Independent") +theme_bw()+geom_segment(lineend ="butt", color ="#5a3196") +geom_hline(yintercept =0, color ="#5a3196") grid.arrange(inde_acf, inde_pacf, nrow=2)
Code
rep_acf <-ggAcf(rep_ts)+ggtitle("ACF Plot for Support Rate for Republican") +theme_bw() +geom_segment(lineend ="butt", color ="#5a3196") +geom_hline(yintercept =0, color ="#5a3196") rep_pacf <-ggPacf(rep_ts)+ggtitle("PACF Plot for Support Rate for Republican") +theme_bw()+geom_segment(lineend ="butt", color ="#5a3196") +geom_hline(yintercept =0, color ="#5a3196") grid.arrange(rep_acf, rep_pacf, nrow=2)
Number of Daily Vaccinations Per Million: ACF Plot has significant lags at 1 and 2 so p = 1, 2. PACF Plot has significant lags at 1 and 2 so q = 1, 2. Regarding stationarity, the ACF plot reveals autocorrelation values surpassing the threshold represented by the dashed line. This observation indicates a lack of stationarity, as high autocorrelation signifies a series that exhibits dependence between consecutive observations. As such, our series demonstrates non-stationary behavior, as it retains significant autocorrelation across multiple lags.
Number of People Vaccinated Per Hundred: ACF Plot has significant lags at 1-3 so p = 1, 2, 3. PACF Plot has significant lags at 1 so q = 1. Regarding stationarity, the ACF plot reveals autocorrelation values surpassing the threshold represented by the dashed line. This observation indicates a lack of stationarity, as high autocorrelation signifies a series that exhibits dependence between consecutive observations. As such, our series demonstrates non-stationary behavior, as it retains significant autocorrelation across multiple lags.
Number of People Fully Vaccinated Per Hundred: ACF Plot has significant lags at 1-3 so p = 1, 2, 3. PACF Plot has significant lags at 1 so q = 1. Regarding stationarity, the ACF plot reveals autocorrelation values surpassing the threshold represented by the dashed line. This observation indicates a lack of stationarity, as high autocorrelation signifies a series that exhibits dependence between consecutive observations. As such, our series demonstrates non-stationary behavior, as it retains significant autocorrelation across multiple lags.
Number of Newly Confirmed Cases: ACF Plot has significant lags at 1-10 so p = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10. However, in general we care about only the first couple of lags, in this case the first 3. PACF Plot has significant lags at 1 so q = 1. Regarding stationarity, the ACF plot reveals autocorrelation values surpassing the threshold represented by the dashed line. This observation indicates a lack of stationarity, as high autocorrelation signifies a series that exhibits dependence between consecutive observations. As such, our series demonstrates non-stationary behavior, as it retains significant autocorrelation across multiple lags.
Number of Death Cases: ACF Plot has significant lags at 1-10 so p = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10. However, in general we care about only the first couple of lags, in this case the first 3. PACF Plot has significant lags at 1 so q = 1. Regarding stationarity, the ACF plot reveals autocorrelation values surpassing the threshold represented by the dashed line. This observation indicates a lack of stationarity, as high autocorrelation signifies a series that exhibits dependence between consecutive observations. As such, our series demonstrates non-stationary behavior, as it retains significant autocorrelation across multiple lags.
Number of Inpatient Beds: ACF Plot has significant lags at 1 and 2 so p = 1, 2. PACF Plot has significant lags at 1 and 4 so q = 1, 4. Regarding stationarity, the ACF plot reveals autocorrelation values surpassing the threshold represented by the dashed line. This observation indicates a lack of stationarity, as high autocorrelation signifies a series that exhibits dependence between consecutive observations. As such, our series demonstrates non-stationary behavior, as it retains significant autocorrelation across multiple lags.
Number of Inpatient Beds Used for COVID: ACF Plot has significant lags at 1 and 2 so p = 1, 2. PACF Plot has significant lags at 1 and 2 so q = 1, 2. Regarding stationarity, the ACF plot reveals autocorrelation values surpassing the threshold represented by the dashed line. This observation indicates a lack of stationarity, as high autocorrelation signifies a series that exhibits dependence between consecutive observations. As such, our series demonstrates non-stationary behavior, as it retains significant autocorrelation across multiple lags.
Utilization Rate for Inpatient Beds Used for COVID: ACF Plot has significant lags at 1 so p = 1. PACF Plot has significant lags at 1-3 so q = 1, 2, 3. Regarding stationarity, the ACF plot reveals autocorrelation values surpassing the threshold represented by the dashed line. This observation indicates a lack of stationarity, as high autocorrelation signifies a series that exhibits dependence between consecutive observations. As such, our series demonstrates non-stationary behavior, as it retains significant autocorrelation across multiple lags.
Unemployment Rate: ACF Plot has significant lags at 1 so p = 1. PACF Plot has significant lags at 1 so q = 1. Regarding stationarity, the ACF plot reveals autocorrelation values surpassing the threshold represented by the dashed line. This observation indicates a lack of stationarity, as high autocorrelation signifies a series that exhibits dependence between consecutive observations. As such, our series demonstrates non-stationary behavior, as it retains significant autocorrelation across multiple lags.
Pfizer Stock Price: ACF Plot has significant lags at 1-10 so p = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10. However, in general we care about only the first couple of lags, in this case the first 3. PACF Plot has significant lags at 1 so q = 1. Regarding stationarity, the ACF plot reveals autocorrelation values surpassing the threshold represented by the dashed line. This observation indicates a lack of stationarity, as high autocorrelation signifies a series that exhibits dependence between consecutive observations. As such, our series demonstrates non-stationary behavior, as it retains significant autocorrelation across multiple lags.
Support Rate for Democratic: There are no lags over the dashed line in the ACF plot, which indicates that there is no significant autocorrelation in the series beyond the lag indicated by the highest peak. In this cases, the ACF plot suggests that there is no systematic relationship between the observations at different time points. This lack of autocorrelation implies that the series is likely stationary, as there is no discernible pattern of dependence between consecutive observations.
Support Rate for Independent: There are no lags over the dashed line in the ACF plot, which indicates that there is no significant autocorrelation in the series beyond the lag indicated by the highest peak. In this cases, the ACF plot suggests that there is no systematic relationship between the observations at different time points. This lack of autocorrelation implies that the series is likely stationary, as there is no discernible pattern of dependence between consecutive observations.
Support Rate for Republican: ACF Plot has significant lags at 1 so p = 1. PACF Plot has significant lags at 1 so q = 1. Regarding stationarity, the ACF plot reveals autocorrelation values surpassing the threshold represented by the dashed line. This observation indicates a lack of stationarity, as high autocorrelation signifies a series that exhibits dependence between consecutive observations. As such, our series demonstrates non-stationary behavior, as it retains significant autocorrelation across multiple lags.
Augmented Dickey-Fuller Test
data: pf_vacc_ts1
Dickey-Fuller = -7.4927, Lag order = 3, p-value = 0.01
alternative hypothesis: stationary
Code
tseries::adf.test(case_ts)
Augmented Dickey-Fuller Test
data: case_ts
Dickey-Fuller = -1.8178, Lag order = 3, p-value = 0.6457
alternative hypothesis: stationary
Code
tseries::adf.test(dead_ts)
Augmented Dickey-Fuller Test
data: dead_ts
Dickey-Fuller = -0.54605, Lag order = 3, p-value = 0.9754
alternative hypothesis: stationary
Code
tseries::adf.test(hos_ts1)
Augmented Dickey-Fuller Test
data: hos_ts1
Dickey-Fuller = -12.058, Lag order = 3, p-value = 0.01
alternative hypothesis: stationary
Code
tseries::adf.test(hos_ts2)
Augmented Dickey-Fuller Test
data: hos_ts2
Dickey-Fuller = -2.9428, Lag order = 3, p-value = 0.1968
alternative hypothesis: stationary
Code
tseries::adf.test(hos_ts3)
Augmented Dickey-Fuller Test
data: hos_ts3
Dickey-Fuller = -2.8936, Lag order = 3, p-value = 0.2165
alternative hypothesis: stationary
Code
tseries::adf.test(employees_ts)
Augmented Dickey-Fuller Test
data: employees_ts
Dickey-Fuller = -8.8759, Lag order = 2, p-value = 0.01
alternative hypothesis: stationary
Code
tseries::adf.test(stock_ts)
Augmented Dickey-Fuller Test
data: stock_ts
Dickey-Fuller = -3.0918, Lag order = 3, p-value = 0.1381
alternative hypothesis: stationary
Code
tseries::adf.test(demo_ts)
Augmented Dickey-Fuller Test
data: demo_ts
Dickey-Fuller = -3.4688, Lag order = 3, p-value = 0.05607
alternative hypothesis: stationary
Code
tseries::adf.test(inde_ts)
Augmented Dickey-Fuller Test
data: inde_ts
Dickey-Fuller = -4.2027, Lag order = 3, p-value = 0.01
alternative hypothesis: stationary
Code
tseries::adf.test(rep_ts)
Augmented Dickey-Fuller Test
data: rep_ts
Dickey-Fuller = -3.4937, Lag order = 3, p-value = 0.05222
alternative hypothesis: stationary
In our project, we delve into an array of statistical series to discern patterns and ascertain stationarity, crucial for understanding sentiment impacts on the stock prices of leading tech companies and broader socio-economic indicators. Our methodology employs rigorous statistical tests, complemented by Autocorrelation Function (ACF) plots, to scrutinize the data’s behavior over time.
Number of Daily Vaccinations Per Million: A p-value below 0.05 signals sufficient grounds to reject the null hypothesis at a 5% significance level, indicating stationarity in our series. This finding, however, contrasts with prior conclusions, suggesting the ACF plot’s superior accuracy, which points toward non-stationarity.
Number of People Vaccinated Per Hundred: The p-value, exceeding 0.05, reveals an insufficient basis to reject the null hypothesis, indicating a non-stationary series. This necessitates further modifications for stationarity, reinforcing conclusions from earlier analyses, including a significant lag order of 3.
Number of People Fully Vaccinated Per Hundred: With a p-value below 0.05, we find adequate evidence to reject the null hypothesis, suggesting stationarity. Yet, this contradicts previous findings, with the ACF plot indicating non-stationarity, challenging our initial conclusion.
Number of Newly Confirmed Cases: A p-value above 0.05 indicates a lack of sufficient evidence to dismiss the null hypothesis, suggesting non-stationarity. This aligns with earlier observations, necessitating adjustments for stationarity, including a noted lag order of 3.
Number of Death Cases: The p-value, again above 0.05, underscores a lack of adequate evidence to reject the null hypothesis, signaling a non-stationary series and the need for further data adjustments. This finding is consistent with prior analyses.
Number of Inpatient Beds: Here, a p-value below 0.05 provides enough justification to reject the null hypothesis, suggesting a stationary series. Nevertheless, this result is at odds with previous analyses, indicating non-stationarity based on the ACF plot.
Number of Inpatient Beds Used for COVID: The p-value surpassing 0.05 suggests insufficient evidence to reject the null hypothesis, pointing to a non-stationary series that requires adjustments, corroborating earlier findings and the significance of a lag order of 3.
Utilization Rate for Inpatient Beds Used for COVID: A high p-value indicates the series’ non-stationarity, echoing the need for adjustments to achieve stationarity and supporting earlier conclusions, including a lag order of 3.
Unemployment Rate: A low p-value indicates sufficient evidence to reject the null hypothesis, suggesting stationarity. However, this contrasts with previous examples, with the ACF plot indicating non-stationarity.
Pfizer Stock Price: With a p-value exceeding 0.05, there’s insufficient evidence to reject the null hypothesis, indicating a non-stationary series requiring adjustments, consistent with earlier findings, including a lag order of 3.
Support Rate for Democratic: A high p-value reveals a lack of evidence to reject the null hypothesis, suggesting non-stationarity and the need for adjustments, contradicting earlier conclusions of stationarity.
Support Rate for Independent: A low p-value provides ample evidence to reject the null hypothesis, indicating a stationary series. This finding aligns with prior conclusions, affirming the series’ stationarity.
Support Rate for Republican: The p-value, exceeding 0.05, indicates insufficient evidence to reject the null hypothesis, suggesting a non-stationary series that necessitates adjustments, in line with earlier analyses.
Through this detailed exploration, we meticulously gauge the stationarity of diverse series, juxtaposing statistical test results against ACF plot insights to draw nuanced conclusions on the dynamic interplay between sentiment, stock price movements, and broader socio-economic indicators.
Detrending and differencing stand as pivotal techniques in the realm of time series analysis, each aimed at achieving the crucial condition of stationarity within a dataset. While navigating the same goal of trend elimination, these methodologies diverge in their approach and application nuances.
Detrending is a targeted process aimed squarely at eradicating the underlying trend from the dataset. This is accomplished by first meticulously estimating the trend component that permeates the time series and then subtracting this estimated trend from the original dataset. The outcome is a transformed series where the original mean has been adjusted to center around zero, effectively neutralizing the trend influence. However, this transformation is not a panacea; detrended data can still exhibit non-stationary characteristics, such as seasonality or variance instabilities, that require further intervention.
Conversely, differencing operates under a broader scope, addressing stationarity by focusing on the differences between consecutive observations. This method is encapsulated by the formula:
\[\Delta y_t = y_t - y_{t-1}\]
where \(\Delta y_t\) represents the difference between the current observation \(y_t\) and its predecessor \(y_{t-1}\). Through this simple yet effective mechanism, differencing excels at mitigating linear trends and highlighting the dynamic changes between data points. Its strength lies particularly in contexts where the time series displays a consistent directional trend, making it a robust choice for such scenarios.
However, it’s worth noting that while differencing is adept at ironing out linear trends, it may falter when faced with nonlinear trends or pronounced seasonal fluctuations. The essence of differencing lies in its ability to simplify the series to a form where patterns and structures become more discernable, albeit at the potential cost of oversimplification in certain complex scenarios.
The decision to employ detrending or differencing hinges on a thorough examination of the time series at hand. The specific characteristics of the dataset, including the nature of its trends and seasonalities, dictate the most appropriate method for achieving stationarity. This choice is not merely technical but strategic, laying the foundation for deeper insights and more accurate forecasts in the pursuit of time series analysis.
# Put time series into dataframedf1 <-data.frame(Year =time(d_vacc_ts), Value =as.vector(d_vacc_ts))# Calculate moving averagema1 <- zoo::rollapply(df1$Value, 3, mean, fill =NA, align ="right")ma2 <- zoo::rollapply(df1$Value, 12, mean, fill =NA, align ="right")ma3 <- zoo::rollapply(df1$Value, 16, mean, fill =NA, align ="right")ma4 <- zoo::rollapply(df1$Value, 24, mean, fill =NA, align ="right")# Add moving averages to dfdf1$ma1 <- ma1df1$ma2 <- ma2df1$ma3 <- ma3df1$ma4 <- ma4# Create plot using ggplot2ggplot(df1, aes(x = Year, y = Value, color ="Data")) +geom_line() +geom_line(aes(y = ma1, color ="3 SMA")) +geom_line(aes(y = ma2, color ="12 SMA")) +geom_line(aes(y = ma3, color ="16 SMA")) +geom_line(aes(y = ma4, color ="24 SMA")) +xlab("Time") +ylab("Daily Vaccination Number (Million)") +ggtitle("Daily Vaccination Moving Average Smoothing") +scale_color_manual(name ="Type", values =c("Data"="gray", "3 SMA"="blue", "12 SMA"="orange", "16 SMA"="green", "24 SMA"="red")) +theme_bw()
Code
# Put time series into dataframedf1 <-data.frame(Year =time(p_vacc_ts), Value =as.vector(p_vacc_ts))# Calculate moving averagema1 <- zoo::rollapply(df1$Value, 3, mean, fill =NA, align ="right")ma2 <- zoo::rollapply(df1$Value, 12, mean, fill =NA, align ="right")ma3 <- zoo::rollapply(df1$Value, 16, mean, fill =NA, align ="right")ma4 <- zoo::rollapply(df1$Value, 24, mean, fill =NA, align ="right")# Add moving averages to dfdf1$ma1 <- ma1df1$ma2 <- ma2df1$ma3 <- ma3df1$ma4 <- ma4# Create plot using ggplot2ggplot(df1, aes(x = Year, y = Value, color ="Data")) +geom_line() +geom_line(aes(y = ma1, color ="3 SMA")) +geom_line(aes(y = ma2, color ="12 SMA")) +geom_line(aes(y = ma3, color ="16 SMA")) +geom_line(aes(y = ma4, color ="24 SMA")) +xlab("Time") +ylab("People Vaccinated Per Hundred") +ggtitle("People Vaccinated Per Hundred Moving Average Smoothing") +scale_color_manual(name ="Type", values =c("Data"="gray", "3 SMA"="blue", "12 SMA"="orange", "16 SMA"="green", "24 SMA"="red")) +theme_bw()
Code
# Put time series into dataframedf1 <-data.frame(Year =time(pf_vacc_ts), Value =as.vector(pf_vacc_ts))# Calculate moving averagema1 <- zoo::rollapply(df1$Value, 3, mean, fill =NA, align ="right")ma2 <- zoo::rollapply(df1$Value, 12, mean, fill =NA, align ="right")ma3 <- zoo::rollapply(df1$Value, 16, mean, fill =NA, align ="right")ma4 <- zoo::rollapply(df1$Value, 24, mean, fill =NA, align ="right")# Add moving averages to dfdf1$ma1 <- ma1df1$ma2 <- ma2df1$ma3 <- ma3df1$ma4 <- ma4# Create plot using ggplot2ggplot(df1, aes(x = Year, y = Value, color ="Data")) +geom_line() +geom_line(aes(y = ma1, color ="3 SMA")) +geom_line(aes(y = ma2, color ="12 SMA")) +geom_line(aes(y = ma3, color ="16 SMA")) +geom_line(aes(y = ma4, color ="24 SMA")) +xlab("Time") +ylab("People Fully Vaccinated Per Hundred") +ggtitle("People Fully Vaccinated Per Hundred Moving Average Smoothing") +scale_color_manual(name ="Type", values =c("Data"="gray", "3 SMA"="blue", "12 SMA"="orange", "16 SMA"="green", "24 SMA"="red")) +theme_bw()
Code
# Put time series into dataframedf1 <-data.frame(Year =time(case_ts), Value =as.vector(case_ts))# Calculate moving averagema1 <- zoo::rollapply(df1$Value, 3, mean, fill =NA, align ="right")ma2 <- zoo::rollapply(df1$Value, 12, mean, fill =NA, align ="right")ma3 <- zoo::rollapply(df1$Value, 16, mean, fill =NA, align ="right")ma4 <- zoo::rollapply(df1$Value, 24, mean, fill =NA, align ="right")# Add moving averages to dfdf1$ma1 <- ma1df1$ma2 <- ma2df1$ma3 <- ma3df1$ma4 <- ma4# Create plot using ggplot2ggplot(df1, aes(x = Year, y = Value, color ="Data")) +geom_line() +geom_line(aes(y = ma1, color ="3 SMA")) +geom_line(aes(y = ma2, color ="12 SMA")) +geom_line(aes(y = ma3, color ="16 SMA")) +geom_line(aes(y = ma4, color ="24 SMA")) +xlab("Time") +ylab("COVID Confirmed Case Number") +ggtitle("COVID Confirmed Case Number Moving Average Smoothing") +scale_color_manual(name ="Type", values =c("Data"="gray", "3 SMA"="blue", "12 SMA"="orange", "16 SMA"="green", "24 SMA"="red")) +theme_bw()
Code
# Put time series into dataframedf1 <-data.frame(Year =time(dead_ts), Value =as.vector(dead_ts))# Calculate moving averagema1 <- zoo::rollapply(df1$Value, 3, mean, fill =NA, align ="right")ma2 <- zoo::rollapply(df1$Value, 12, mean, fill =NA, align ="right")ma3 <- zoo::rollapply(df1$Value, 16, mean, fill =NA, align ="right")ma4 <- zoo::rollapply(df1$Value, 24, mean, fill =NA, align ="right")# Add moving averages to dfdf1$ma1 <- ma1df1$ma2 <- ma2df1$ma3 <- ma3df1$ma4 <- ma4# Create plot using ggplot2ggplot(df1, aes(x = Year, y = Value, color ="Data")) +geom_line() +geom_line(aes(y = ma1, color ="3 SMA")) +geom_line(aes(y = ma2, color ="12 SMA")) +geom_line(aes(y = ma3, color ="16 SMA")) +geom_line(aes(y = ma4, color ="24 SMA")) +xlab("Time") +ylab("COVID Dead Case Number") +ggtitle("COVID Dead Case Number Moving Average Smoothing") +scale_color_manual(name ="Type", values =c("Data"="gray", "3 SMA"="blue", "12 SMA"="orange", "16 SMA"="green", "24 SMA"="red")) +theme_bw()
Code
# Put time series into dataframedf1 <-data.frame(Year =time(hos_ts1), Value =as.vector(hos_ts1))# Calculate moving averagema1 <- zoo::rollapply(df1$Value, 3, mean, fill =NA, align ="right")ma2 <- zoo::rollapply(df1$Value, 12, mean, fill =NA, align ="right")ma3 <- zoo::rollapply(df1$Value, 16, mean, fill =NA, align ="right")ma4 <- zoo::rollapply(df1$Value, 24, mean, fill =NA, align ="right")# Add moving averages to dfdf1$ma1 <- ma1df1$ma2 <- ma2df1$ma3 <- ma3df1$ma4 <- ma4# Create plot using ggplot2ggplot(df1, aes(x = Year, y = Value, color ="Data")) +geom_line() +geom_line(aes(y = ma1, color ="3 SMA")) +geom_line(aes(y = ma2, color ="12 SMA")) +geom_line(aes(y = ma3, color ="16 SMA")) +geom_line(aes(y = ma4, color ="24 SMA")) +xlab("Time") +ylab("Inpatient Beds Number") +ggtitle("Inpatient Beds Number Moving Average Smoothing") +scale_color_manual(name ="Type", values =c("Data"="gray", "3 SMA"="blue", "12 SMA"="orange", "16 SMA"="green", "24 SMA"="red")) +theme_bw()
Code
# Put time series into dataframedf1 <-data.frame(Year =time(hos_ts2), Value =as.vector(hos_ts2))# Calculate moving averagema1 <- zoo::rollapply(df1$Value, 3, mean, fill =NA, align ="right")ma2 <- zoo::rollapply(df1$Value, 12, mean, fill =NA, align ="right")ma3 <- zoo::rollapply(df1$Value, 16, mean, fill =NA, align ="right")ma4 <- zoo::rollapply(df1$Value, 24, mean, fill =NA, align ="right")# Add moving averages to dfdf1$ma1 <- ma1df1$ma2 <- ma2df1$ma3 <- ma3df1$ma4 <- ma4# Create plot using ggplot2ggplot(df1, aes(x = Year, y = Value, color ="Data")) +geom_line() +geom_line(aes(y = ma1, color ="3 SMA")) +geom_line(aes(y = ma2, color ="12 SMA")) +geom_line(aes(y = ma3, color ="16 SMA")) +geom_line(aes(y = ma4, color ="24 SMA")) +xlab("Time") +ylab("Inpatient Beds Used for COVID Number") +ggtitle("Inpatient Beds Used for COVID Number Moving Average Smoothing") +scale_color_manual(name ="Type", values =c("Data"="gray", "3 SMA"="blue", "12 SMA"="orange", "16 SMA"="green", "24 SMA"="red")) +theme_bw()
Code
# Put time series into dataframedf1 <-data.frame(Year =time(hos_ts3), Value =as.vector(hos_ts3))# Calculate moving averagema1 <- zoo::rollapply(df1$Value, 3, mean, fill =NA, align ="right")ma2 <- zoo::rollapply(df1$Value, 12, mean, fill =NA, align ="right")ma3 <- zoo::rollapply(df1$Value, 16, mean, fill =NA, align ="right")ma4 <- zoo::rollapply(df1$Value, 24, mean, fill =NA, align ="right")# Add moving averages to dfdf1$ma1 <- ma1df1$ma2 <- ma2df1$ma3 <- ma3df1$ma4 <- ma4# Create plot using ggplot2ggplot(df1, aes(x = Year, y = Value, color ="Data")) +geom_line() +geom_line(aes(y = ma1, color ="3 SMA")) +geom_line(aes(y = ma2, color ="12 SMA")) +geom_line(aes(y = ma3, color ="16 SMA")) +geom_line(aes(y = ma4, color ="24 SMA")) +xlab("Time") +ylab("Utilization Rate") +ggtitle("Utilization Rate of Inpatient Beds for COVID Moving Average Smoothing") +scale_color_manual(name ="Type", values =c("Data"="gray", "3 SMA"="blue", "12 SMA"="orange", "16 SMA"="green", "24 SMA"="red")) +theme_bw()
Code
# Put time series into dataframedf1 <-data.frame(Year =time(employees_ts), Value =as.vector(employees_ts))# Calculate moving averagema1 <- zoo::rollapply(df1$Value, 3, mean, fill =NA, align ="right")ma2 <- zoo::rollapply(df1$Value, 12, mean, fill =NA, align ="right")ma3 <- zoo::rollapply(df1$Value, 16, mean, fill =NA, align ="right")ma4 <- zoo::rollapply(df1$Value, 24, mean, fill =NA, align ="right")# Add moving averages to dfdf1$ma1 <- ma1df1$ma2 <- ma2df1$ma3 <- ma3df1$ma4 <- ma4# Create plot using ggplot2ggplot(df1, aes(x = Year, y = Value, color ="Data")) +geom_line() +geom_line(aes(y = ma1, color ="3 SMA")) +geom_line(aes(y = ma2, color ="12 SMA")) +geom_line(aes(y = ma3, color ="16 SMA")) +geom_line(aes(y = ma4, color ="24 SMA")) +xlab("Time") +ylab("Unemployment Rate") +ggtitle("Unemployment Rate Moving Average Smoothing") +scale_color_manual(name ="Type", values =c("Data"="gray", "3 SMA"="blue", "12 SMA"="orange", "16 SMA"="green", "24 SMA"="red")) +theme_bw()
Code
# Put time series into dataframedf1 <-data.frame(Year =time(stock_ts), Value =as.vector(stock_ts))# Calculate moving averagema1 <- zoo::rollapply(df1$Value, 3, mean, fill =NA, align ="right")ma2 <- zoo::rollapply(df1$Value, 12, mean, fill =NA, align ="right")ma3 <- zoo::rollapply(df1$Value, 16, mean, fill =NA, align ="right")ma4 <- zoo::rollapply(df1$Value, 24, mean, fill =NA, align ="right")# Add moving averages to dfdf1$ma1 <- ma1df1$ma2 <- ma2df1$ma3 <- ma3df1$ma4 <- ma4# Create plot using ggplot2ggplot(df1, aes(x = Year, y = Value, color ="Data")) +geom_line() +geom_line(aes(y = ma1, color ="3 SMA")) +geom_line(aes(y = ma2, color ="12 SMA")) +geom_line(aes(y = ma3, color ="16 SMA")) +geom_line(aes(y = ma4, color ="24 SMA")) +xlab("Time") +ylab("Stock Price") +ggtitle("Pfizer Stock Price Moving Average Smoothing") +scale_color_manual(name ="Type", values =c("Data"="gray", "3 SMA"="blue", "12 SMA"="orange", "16 SMA"="green", "24 SMA"="red")) +theme_bw()
Code
# Put time series into dataframedf1 <-data.frame(Year =time(demo_ts), Value =as.vector(demo_ts))# Calculate moving averagema1 <- zoo::rollapply(df1$Value, 3, mean, fill =NA, align ="right")ma2 <- zoo::rollapply(df1$Value, 12, mean, fill =NA, align ="right")ma3 <- zoo::rollapply(df1$Value, 16, mean, fill =NA, align ="right")ma4 <- zoo::rollapply(df1$Value, 24, mean, fill =NA, align ="right")# Add moving averages to dfdf1$ma1 <- ma1df1$ma2 <- ma2df1$ma3 <- ma3df1$ma4 <- ma4# Create plot using ggplot2ggplot(df1, aes(x = Year, y = Value, color ="Data")) +geom_line() +geom_line(aes(y = ma1, color ="3 SMA")) +geom_line(aes(y = ma2, color ="12 SMA")) +geom_line(aes(y = ma3, color ="16 SMA")) +geom_line(aes(y = ma4, color ="24 SMA")) +xlab("Time") +ylab("Support Rate") +ggtitle("Support Rate for Democratics Moving Average Smoothing") +scale_color_manual(name ="Type", values =c("Data"="gray", "3 SMA"="blue", "12 SMA"="orange", "16 SMA"="green", "24 SMA"="red")) +theme_bw()
Code
# Put time series into dataframedf1 <-data.frame(Year =time(inde_ts), Value =as.vector(inde_ts))# Calculate moving averagema1 <- zoo::rollapply(df1$Value, 3, mean, fill =NA, align ="right")ma2 <- zoo::rollapply(df1$Value, 12, mean, fill =NA, align ="right")ma3 <- zoo::rollapply(df1$Value, 16, mean, fill =NA, align ="right")ma4 <- zoo::rollapply(df1$Value, 24, mean, fill =NA, align ="right")# Add moving averages to dfdf1$ma1 <- ma1df1$ma2 <- ma2df1$ma3 <- ma3df1$ma4 <- ma4# Create plot using ggplot2ggplot(df1, aes(x = Year, y = Value, color ="Data")) +geom_line() +geom_line(aes(y = ma1, color ="3 SMA")) +geom_line(aes(y = ma2, color ="12 SMA")) +geom_line(aes(y = ma3, color ="16 SMA")) +geom_line(aes(y = ma4, color ="24 SMA")) +xlab("Time") +ylab("Support Rate") +ggtitle("Support Rate for Independent Moving Average Smoothing") +scale_color_manual(name ="Type", values =c("Data"="gray", "3 SMA"="blue", "12 SMA"="orange", "16 SMA"="green", "24 SMA"="red")) +theme_bw()
Code
# Put time series into dataframedf1 <-data.frame(Year =time(rep_ts), Value =as.vector(rep_ts))# Calculate moving averagema1 <- zoo::rollapply(df1$Value, 3, mean, fill =NA, align ="right")ma2 <- zoo::rollapply(df1$Value, 12, mean, fill =NA, align ="right")ma3 <- zoo::rollapply(df1$Value, 16, mean, fill =NA, align ="right")ma4 <- zoo::rollapply(df1$Value, 24, mean, fill =NA, align ="right")# Add moving averages to dfdf1$ma1 <- ma1df1$ma2 <- ma2df1$ma3 <- ma3df1$ma4 <- ma4# Create plot using ggplot2ggplot(df1, aes(x = Year, y = Value, color ="Data")) +geom_line() +geom_line(aes(y = ma1, color ="3 SMA")) +geom_line(aes(y = ma2, color ="12 SMA")) +geom_line(aes(y = ma3, color ="16 SMA")) +geom_line(aes(y = ma4, color ="24 SMA")) +xlab("Time") +ylab("Support Rate") +ggtitle("Support Rate for Republican Moving Average Smoothing") +scale_color_manual(name ="Type", values =c("Data"="gray", "3 SMA"="blue", "12 SMA"="orange", "16 SMA"="green", "24 SMA"="red")) +theme_bw()
To further refine and enrich the explanation of the use of moving averages in data analysis:
In our pursuit of deeper insights, we expanded our analytical toolkit by integrating four distinct moving average windows into the original dataset. This methodical selection spans from agile short-term perspectives to more deliberate long-term vistas. Specifically, we introduced a nimble 3-period moving average (3-MA) to catch immediate trends, alongside two medium-term averages of 12 periods (12-MA) and 16 periods (16-MA), culminating in a comprehensive 24-period moving average (24-MA) to gauge extended trends.
Upon thorough examination, the 12-MA distinctively stands out for its adeptness across various data series. While the 3-MA provides a glimpse into the data’s immediate direction, it often glosses over the dataset’s more impactful shifts. On the other end of the spectrum, the 16-MA and 24-MA, with their broader strokes, tend to dilute the dataset, masking subtle yet significant patterns, such as seasonal variations that are crucial for holistic analysis.
Remarkably, the 12-MA window adeptly navigates between these extremes. It offers a balanced lens through which meaningful data fluctuations are brought to the forefront, with just enough smoothing to clarify the analysis without erasing important details. This equilibrium is especially beneficial for analyzing spending data, characterized by its smoother fluctuations. Here, the slightly gentler touch of the 12-MA allows us to unearth and understand the underlying trends with greater precision.